suppressPackageStartupMessages(library(tidyverse))
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
# Project root. Every path below is built by plain string concatenation onto
# this value, so the trailing slash has to stay.
wd <- '~/Google Drive/My Drive/Analysis/METTL2A/'
setwd(wd)

# Output folders for the tables and figures of this analysis.
tabledir <- paste0(wd, 'Tables/Coverage_CI/')
figdir   <- paste0(wd, 'Figures/Coverage_CI/')

# Session-wide ggplot2 default: classic theme, 7 pt base size, legend at the
# bottom.
theme_set(theme_classic(base_size = 7) + theme(legend.position = 'bottom'))

# Functions ----

#' Prefix a project-relative path with the project root.
#'
#' @param path Character vector of paths relative to the global `wd`.
#' @return `path` with `wd` pasted in front; no separator is inserted.
paste_wd <- function(path) paste0(wd, path)

# Read data ----

# Transcript-level table read from the Espresso tables folder (the printed
# columns below show per-comparison DESeq2-style statistics).
espresso_deseq2_genetype2_isDET <- read_tsv(
  paste_wd('Tables/Espresso/espresso_deseq2_genetype2_isDET_2024-04-18.tsv')
)
## Rows: 36717 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (11): transcript_id, transcript_type, transcript_name, gene_id, gene_typ...
## dbl (18): siMETTL2A_baseMean, siMETTL2A_log2FoldChange, siMETTL2A_lfcSE, siM...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a preview of the table.
espresso_deseq2_genetype2_isDET
## # A tibble: 36,717 × 29
##    transcript_id     transcript_type transcript_name gene_id gene_type gene_name
##    <chr>             <chr>           <chr>           <chr>   <chr>     <chr>    
##  1 ENST00000498442.1 retained_intron CRBN-212        ENSG00… protein_… CRBN     
##  2 ENST00000459840.5 retained_intron CRBN-205        ENSG00… protein_… CRBN     
##  3 ENST00000231948.9 protein_coding  CRBN-201        ENSG00… protein_… CRBN     
##  4 ENST00000432408.6 protein_coding  CRBN-203        ENSG00… protein_… CRBN     
##  5 ENST00000339437.… protein_coding  TRNT1-203       ENSG00… protein_… TRNT1    
##  6 ENST00000488263.5 retained_intron CRBN-209        ENSG00… protein_… CRBN     
##  7 ENST00000420393.5 protein_coding  TRNT1-207       ENSG00… protein_… TRNT1    
##  8 ENST00000698415.1 retained_intron TRNT1-230       ENSG00… protein_… TRNT1    
##  9 ENST00000450014.1 protein_coding  CRBN-204        ENSG00… protein_… CRBN     
## 10 ENST00000698416.1 retained_intron TRNT1-231       ENSG00… protein_… TRNT1    
## # ℹ 36,707 more rows
## # ℹ 23 more variables: siMETTL2A_baseMean <dbl>,
## #   siMETTL2A_log2FoldChange <dbl>, siMETTL2A_lfcSE <dbl>,
## #   siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>, siMETTL2A_padj <dbl>,
## #   siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## #   siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, siMETTL2A_I_pvalue <dbl>,
## #   siMETTL2A_I_padj <dbl>, siMETTL2A_G_baseMean <dbl>, …
# Per-position table (one row per transcript position / k-mer) with the _G and
# _I comparison columns side by side; see the column listing further down.
sampcomp_results_joined <- read_tsv(
  paste_wd('Tables/DRS_m3C_sites/sampcomp_results_joined_2024-04-24.tsv.gz')
)
## Rows: 5884004 Columns: 67
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (34): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (33): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a preview of the table.
sampcomp_results_joined
## # A tibble: 5,884,004 × 67
##    transcript_id     transcript_name position ref_kmer GMM_logit_pvalue_G
##    <chr>             <chr>              <dbl> <chr>                 <dbl>
##  1 ENST00000264926.7 RAD18-201           1464 TCACA                    NA
##  2 ENST00000264926.7 RAD18-201           1465 CACAT                     1
##  3 ENST00000264926.7 RAD18-201           1466 ACATA                    NA
##  4 ENST00000264926.7 RAD18-201           1467 CATAA                     1
##  5 ENST00000264926.7 RAD18-201           1468 ATAAA                    NA
##  6 ENST00000264926.7 RAD18-201           1473 AACGA                     1
##  7 ENST00000264926.7 RAD18-201           1475 CGATC                    NA
##  8 ENST00000264926.7 RAD18-201           1486 ACACA                    NA
##  9 ENST00000264926.7 RAD18-201           1501 CAAGA                     1
## 10 ENST00000264926.7 RAD18-201           1502 AAGAC                    NA
## # ℹ 5,883,994 more rows
## # ℹ 62 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## #   GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## #   Logit_LOR_G <chr>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## #   c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## #   c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## #   c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# List every column of the per-position table.
names(sampcomp_results_joined)
##  [1] "transcript_id"         "transcript_name"       "position"             
##  [4] "ref_kmer"              "GMM_logit_pvalue_G"    "KS_dwell_pvalue_G"    
##  [7] "KS_intensity_pvalue_G" "GMM_cov_type_G"        "GMM_n_clust_G"        
## [10] "cluster_counts_G"      "Logit_LOR_G"           "c1_mean_intensity_G"  
## [13] "c2_mean_intensity_G"   "c1_median_intensity_G" "c2_median_intensity_G"
## [16] "c1_sd_intensity_G"     "c2_sd_intensity_G"     "c1_mean_dwell_G"      
## [19] "c2_mean_dwell_G"       "c1_median_dwell_G"     "c2_median_dwell_G"    
## [22] "c1_sd_dwell_G"         "c2_sd_dwell_G"         "intensity_up_G"       
## [25] "intensity_down_G"      "dwell_up_G"            "dwell_down_G"         
## [28] "GMM_change_G"          "GMM_logit_pvalue_I"    "KS_dwell_pvalue_I"    
## [31] "KS_intensity_pvalue_I" "GMM_cov_type_I"        "GMM_n_clust_I"        
## [34] "cluster_counts_I"      "Logit_LOR_I"           "c1_mean_intensity_I"  
## [37] "c2_mean_intensity_I"   "c1_median_intensity_I" "c2_median_intensity_I"
## [40] "c1_sd_intensity_I"     "c2_sd_intensity_I"     "c1_mean_dwell_I"      
## [43] "c2_mean_dwell_I"       "c1_median_dwell_I"     "c2_median_dwell_I"    
## [46] "c1_sd_dwell_I"         "c2_sd_dwell_I"         "intensity_up_I"       
## [49] "intensity_down_I"      "dwell_up_I"            "dwell_down_I"         
## [52] "GMM_change_I"          "intensity_up"          "intensity_down"       
## [55] "dwell_up"              "dwell_down"            "GMM_change"           
## [58] "middle_base"           "middle_isC"            "have_CC_middle"       
## [61] "have_C3_middle"        "middleC_info"          "transcript_type"      
## [64] "gene_id"               "gene_type"             "gene_name"            
## [67] "seqname"

# Compare among groups ----

# One row per transcript: collapse the per-site intensity_up / intensity_down
# labels into comma-separated sets, then classify the transcript by whether
# any of its sites carries the 'common' label in either direction.
sampcomp_results_intensity_group <-
  sampcomp_results_joined |>
  select(transcript_id:ref_kmer, intensity_up, intensity_down) |>
  group_by(transcript_id, transcript_name) |>
  summarise(
    intensity_up_group   = paste(unique(intensity_up), collapse = ','),
    intensity_down_group = paste(unique(intensity_down), collapse = ','),
    .groups = 'drop'
  ) |>
  mutate(
    # Temporary flags, removed again right after the classification.
    has_common_up   = grepl('common', intensity_up_group),
    has_common_down = grepl('common', intensity_down_group),
    # case_when() takes the first matching branch, so the single-flag
    # branches only see rows that did not match 'both up and down'.
    intensity_group = case_when(
      has_common_up & has_common_down ~ 'both up and down',
      has_common_up                   ~ 'up',
      has_common_down                 ~ 'down',
      .default = 'others'
    )
  ) |>
  select(-has_common_up, -has_common_down)
sampcomp_results_intensity_group
## # A tibble: 5,297 × 5
##    transcript_id      transcript_name intensity_up_group intensity_down_group
##    <chr>              <chr>           <chr>              <chr>               
##  1 ENST00000000233.10 ARF5-201        others             others              
##  2 ENST00000000412.8  M6PR-201        others             others              
##  3 ENST00000000442.11 ESRRA-201       others             others              
##  4 ENST00000001008.6  FKBP4-201       others             others              
##  5 ENST00000002165.11 FUCA2-201       others             others              
##  6 ENST00000003100.13 CYP51A1-201     others,only I      others              
##  7 ENST00000004103.8  TMEM176A-201    others             others              
##  8 ENST00000005257.7  RALA-201        others             others              
##  9 ENST00000005260.9  BAIAP2L1-201    others             others              
## 10 ENST00000005386.8  RPAP3-201       others             others              
## # ℹ 5,287 more rows
## # ℹ 1 more variable: intensity_group <chr>
# Number of transcripts in each intensity class.
count(sampcomp_results_intensity_group, intensity_group)
## # A tibble: 4 × 2
##   intensity_group      n
##   <chr>            <int>
## 1 both up and down     6
## 2 down                 2
## 3 others            5210
## 4 up                  79
# Attach the per-transcript intensity class to the transcript-level table.
# The join keys are spelled out so the join cannot silently change if the two
# tables ever come to share another column name (this also silences the
# "Joining with ..." message).
espresso_deseq2_genetype2_isDET_intensitygroup <-
  espresso_deseq2_genetype2_isDET |>
  full_join(
    sampcomp_results_intensity_group,
    by = join_by(transcript_id, transcript_name)
  )
espresso_deseq2_genetype2_isDET_intensitygroup
## # A tibble: 36,717 × 32
##    transcript_id     transcript_type transcript_name gene_id gene_type gene_name
##    <chr>             <chr>           <chr>           <chr>   <chr>     <chr>    
##  1 ENST00000498442.1 retained_intron CRBN-212        ENSG00… protein_… CRBN     
##  2 ENST00000459840.5 retained_intron CRBN-205        ENSG00… protein_… CRBN     
##  3 ENST00000231948.9 protein_coding  CRBN-201        ENSG00… protein_… CRBN     
##  4 ENST00000432408.6 protein_coding  CRBN-203        ENSG00… protein_… CRBN     
##  5 ENST00000339437.… protein_coding  TRNT1-203       ENSG00… protein_… TRNT1    
##  6 ENST00000488263.5 retained_intron CRBN-209        ENSG00… protein_… CRBN     
##  7 ENST00000420393.5 protein_coding  TRNT1-207       ENSG00… protein_… TRNT1    
##  8 ENST00000698415.1 retained_intron TRNT1-230       ENSG00… protein_… TRNT1    
##  9 ENST00000450014.1 protein_coding  CRBN-204        ENSG00… protein_… CRBN     
## 10 ENST00000698416.1 retained_intron TRNT1-231       ENSG00… protein_… TRNT1    
## # ℹ 36,707 more rows
## # ℹ 26 more variables: siMETTL2A_baseMean <dbl>,
## #   siMETTL2A_log2FoldChange <dbl>, siMETTL2A_lfcSE <dbl>,
## #   siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>, siMETTL2A_padj <dbl>,
## #   siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## #   siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, siMETTL2A_I_pvalue <dbl>,
## #   siMETTL2A_I_padj <dbl>, siMETTL2A_G_baseMean <dbl>, …
# siMETTL2A log2 fold change per intensity class; the horizontal line marks
# zero change.
ggplot(
  espresso_deseq2_genetype2_isDET_intensitygroup,
  aes(x = intensity_group, y = siMETTL2A_log2FoldChange)
) +
  geom_boxplot() +
  geom_hline(yintercept = 0)
## Warning: Removed 340 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Correlation ----

# Per-site median intensity shift for each comparison (column suffixes _G and
# _I). Original author's note: c1 = ctrl, c2 = KD, so each diff is KD - ctrl.
sampcomp_results_intensity <-
  sampcomp_results_joined |>
  select(
    transcript_id:ref_kmer,
    intensity_up, intensity_down,
    contains('_median_intensity_'), contains('_mean_intensity_')
  ) |>
  mutate(
    median_intensity_diff_G = c2_median_intensity_G - c1_median_intensity_G,
    # Fixed: the _I difference used to subtract the _G control median
    # (c1_median_intensity_G), mixing the two comparisons. Any output rendered
    # from median_intensity_diff_I before this fix needs to be regenerated.
    median_intensity_diff_I = c2_median_intensity_I - c1_median_intensity_I
  )
sampcomp_results_intensity
## # A tibble: 5,884,004 × 16
##    transcript_id   transcript_name position ref_kmer intensity_up intensity_down
##    <chr>           <chr>              <dbl> <chr>    <chr>        <chr>         
##  1 ENST0000026492… RAD18-201           1464 TCACA    others       others        
##  2 ENST0000026492… RAD18-201           1465 CACAT    others       others        
##  3 ENST0000026492… RAD18-201           1466 ACATA    others       others        
##  4 ENST0000026492… RAD18-201           1467 CATAA    others       others        
##  5 ENST0000026492… RAD18-201           1468 ATAAA    others       others        
##  6 ENST0000026492… RAD18-201           1473 AACGA    others       others        
##  7 ENST0000026492… RAD18-201           1475 CGATC    others       others        
##  8 ENST0000026492… RAD18-201           1486 ACACA    others       others        
##  9 ENST0000026492… RAD18-201           1501 CAAGA    others       others        
## 10 ENST0000026492… RAD18-201           1502 AAGAC    others       others        
## # ℹ 5,883,994 more rows
## # ℹ 10 more variables: c1_median_intensity_G <dbl>,
## #   c2_median_intensity_G <dbl>, c1_median_intensity_I <dbl>,
## #   c2_median_intensity_I <dbl>, c1_mean_intensity_G <dbl>,
## #   c2_mean_intensity_G <dbl>, c1_mean_intensity_I <dbl>,
## #   c2_mean_intensity_I <dbl>, median_intensity_diff_G <dbl>,
## #   median_intensity_diff_I <dbl>
# Attach the per-site intensity differences to the transcript-level table;
# transcript-level values are repeated for every site of a transcript.
# The join keys are spelled out so the join cannot silently change if the two
# tables ever come to share another column name (this also silences the
# "Joining with ..." message).
espresso_deseq2_genetype2_isDET_intensitydiff <-
  espresso_deseq2_genetype2_isDET |>
  full_join(
    sampcomp_results_intensity,
    by = join_by(transcript_id, transcript_name)
  )
espresso_deseq2_genetype2_isDET_intensitydiff
## # A tibble: 5,915,424 × 43
##    transcript_id     transcript_type transcript_name gene_id gene_type gene_name
##    <chr>             <chr>           <chr>           <chr>   <chr>     <chr>    
##  1 ENST00000498442.1 retained_intron CRBN-212        ENSG00… protein_… CRBN     
##  2 ENST00000459840.5 retained_intron CRBN-205        ENSG00… protein_… CRBN     
##  3 ENST00000231948.9 protein_coding  CRBN-201        ENSG00… protein_… CRBN     
##  4 ENST00000432408.6 protein_coding  CRBN-203        ENSG00… protein_… CRBN     
##  5 ENST00000339437.… protein_coding  TRNT1-203       ENSG00… protein_… TRNT1    
##  6 ENST00000488263.5 retained_intron CRBN-209        ENSG00… protein_… CRBN     
##  7 ENST00000420393.5 protein_coding  TRNT1-207       ENSG00… protein_… TRNT1    
##  8 ENST00000698415.1 retained_intron TRNT1-230       ENSG00… protein_… TRNT1    
##  9 ENST00000450014.1 protein_coding  CRBN-204        ENSG00… protein_… CRBN     
## 10 ENST00000698416.1 retained_intron TRNT1-231       ENSG00… protein_… TRNT1    
## # ℹ 5,915,414 more rows
## # ℹ 37 more variables: siMETTL2A_baseMean <dbl>,
## #   siMETTL2A_log2FoldChange <dbl>, siMETTL2A_lfcSE <dbl>,
## #   siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>, siMETTL2A_padj <dbl>,
## #   siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## #   siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, siMETTL2A_I_pvalue <dbl>,
## #   siMETTL2A_I_padj <dbl>, siMETTL2A_G_baseMean <dbl>, …
# Hexbin of median intensity shift (x) against log2 fold change (y) for the G
# comparison, with a linear fit and faint zero lines on both axes. Fill is the
# bin count on a log10 scale.
ggplot(
  espresso_deseq2_genetype2_isDET_intensitydiff,
  aes(x = median_intensity_diff_G, y = siMETTL2A_G_log2FoldChange)
) +
  geom_hex(bins = 100) +
  stat_smooth(method = 'lm') +
  geom_vline(xintercept = 0, alpha = 1/5) +
  geom_hline(yintercept = 0, alpha = 1/5) +
  scale_fill_viridis_c(trans = 'log10')
## Warning: Removed 64846 rows containing non-finite outside the scale range
## (`stat_binhex()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 64846 rows containing non-finite outside the scale range
## (`stat_smooth()`).

# 2D histogram plus correlation tests for the G comparison.
# plot_2dhistogram_withcortest() is not defined in this file (presumably it
# comes from myUtilities — confirm).
plot_2dhistogram_withcortest(
  espresso_deseq2_genetype2_isDET_intensitydiff,
  x = median_intensity_diff_G,
  y = siMETTL2A_G_log2FoldChange,
  n_bins = 100
)
## Warning in cor.test.default(x = mf[[1L]], y = mf[[2L]], ...): Cannot compute
## exact p-value with ties
## # A tibble: 2 × 9
##   estimate statistic  p.value method method_short alternative parameter conf.low
##      <dbl>     <dbl>    <dbl> <chr>  <chr>        <chr>           <int>    <dbl>
## 1 -0.00625   3.36e19 1.47e-51 Spear… Spearman     two.sided          NA NA      
## 2 -0.00318  -7.69e 0 1.48e-14 Pears… Pearson      two.sided     5850576 -0.00399
## # ℹ 1 more variable: conf.high <dbl>
## [1] 5850578
## # A tibble: 2 × 1
##   msg                                          
##   <chr>                                        
## 1 Spearman: r = -0.01, p < 2.2e-16, n = 5850578
## 2 Pearson: r = 0, p = 1.48e-14, n = 5850578

# 2D histogram plus correlation tests for the I comparison.
# plot_2dhistogram_withcortest() is not defined in this file (presumably it
# comes from myUtilities — confirm).
plot_2dhistogram_withcortest(
  espresso_deseq2_genetype2_isDET_intensitydiff,
  x = median_intensity_diff_I,
  y = siMETTL2A_I_log2FoldChange,
  n_bins = 100
)
## Warning in cor.test.default(x = mf[[1L]], y = mf[[2L]], ...): Cannot compute
## exact p-value with ties
## # A tibble: 2 × 9
##   estimate statistic  p.value method method_short alternative parameter conf.low
##      <dbl>     <dbl>    <dbl> <chr>  <chr>        <chr>           <int>    <dbl>
## 1 -0.0109    5.61e18 1.93e-85 Spear… Spearman     two.sided          NA NA      
## 2 -0.00547  -9.81e 0 1.00e-22 Pears… Pearson      two.sided     3217023 -0.00656
## # ℹ 1 more variable: conf.high <dbl>
## [1] 3217025
## # A tibble: 2 × 1
##   msg                                          
##   <chr>                                        
## 1 Spearman: r = -0.01, p < 2.2e-16, n = 3217025
## 2 Pearson: r = -0.01, p < 2.2e-16, n = 3217025

#' Hexbin plot of one column against another with zero reference lines.
#'
#' @param df Data frame holding both columns.
#' @param .x,.y Unquoted column names mapped to the x and y axes.
#' @param bins Number of hexagon bins, passed to `geom_hex()`.
#' @param x_limits,y_limits Length-2 numeric axis limits. They are set on the
#'   scales, so rows falling outside them are removed before binning (ggplot2
#'   warns about the removed rows) rather than merely hidden.
#' @return A ggplot object; fill is the bin count on a log10 scale.
plot_intensity_coverage_change_correlation <- function(
    df, .x, .y,
    bins = 100, x_limits = c(-20, 20), y_limits = c(-10, 10)
) {
  df |>
    ggplot(aes(x = {{ .x }}, y = {{ .y }})) +
    geom_hex(bins = bins) +
    # Faint reference lines at zero change on either axis.
    geom_vline(xintercept = 0, alpha = 1/5) +
    geom_hline(yintercept = 0, alpha = 1/5) +
    scale_x_continuous(limits = x_limits) +
    scale_y_continuous(limits = y_limits) +
    scale_fill_viridis_c(trans = 'log10')
}


# G comparison: median intensity shift against log2 fold change.
currentintensity_coverage_correlation_G <-
  plot_intensity_coverage_change_correlation(
    espresso_deseq2_genetype2_isDET_intensitydiff,
    .x = median_intensity_diff_G,
    .y = siMETTL2A_G_log2FoldChange
  )
# No file name is passed; presumably ggsave_pdf() derives it from the name of
# the plot object — confirm in myUtilities before renaming the variable.
ggsave_pdf(
  currentintensity_coverage_correlation_G,
  width = 5, height = 6, outdir = figdir
)
## Warning: Removed 66072 rows containing non-finite outside the scale range
## (`stat_binhex()`).
## Warning: Removed 66072 rows containing non-finite outside the scale range
## (`stat_binhex()`).

# I comparison: median intensity shift against log2 fold change.
currentintensity_coverage_correlation_I <-
  plot_intensity_coverage_change_correlation(
    espresso_deseq2_genetype2_isDET_intensitydiff,
    .x = median_intensity_diff_I,
    .y = siMETTL2A_I_log2FoldChange
  )
# No file name is passed; presumably ggsave_pdf() derives it from the name of
# the plot object — confirm in myUtilities before renaming the variable.
ggsave_pdf(
  currentintensity_coverage_correlation_I,
  width = 5, height = 6, outdir = figdir
)
## Warning: Removed 2698411 rows containing non-finite outside the scale range
## (`stat_binhex()`).
## Warning: Removed 2698411 rows containing non-finite outside the scale range
## (`stat_binhex()`).

# Sites kept as m3C_sites: intensity_up labelled 'common' and middle_base
# equal to 'C'. Rows where either test is NA are dropped by filter().
m3C_sites <- filter(
  sampcomp_results_joined,
  intensity_up == 'common',
  middle_base == 'C'
)
m3C_sites
## # A tibble: 489 × 67
##    transcript_id     transcript_name position ref_kmer GMM_logit_pvalue_G
##    <chr>             <chr>              <dbl> <chr>                 <dbl>
##  1 ENST00000429711.7 RPL32-204            422 GCCCA                 1    
##  2 ENST00000647248.2 RPL35A-211           380 ACCCC                 1    
##  3 ENST00000647248.2 RPL35A-211           381 CCCCT                 1    
##  4 ENST00000389680.2 MT-RNR1-201           57 CCCCG                 1    
##  5 ENST00000389680.2 MT-RNR1-201           75 ACCCT                 0.777
##  6 ENST00000389680.2 MT-RNR1-201           93 ATCAA                 1    
##  7 ENST00000389680.2 MT-RNR1-201          148 GCCAC                 1    
##  8 ENST00000389680.2 MT-RNR1-201          153 ACCCC                 1    
##  9 ENST00000389680.2 MT-RNR1-201          154 CCCCC                 1    
## 10 ENST00000389680.2 MT-RNR1-201          155 CCCCA                 1    
## # ℹ 479 more rows
## # ℹ 62 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## #   GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## #   Logit_LOR_G <chr>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## #   c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## #   c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## #   c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# All rows of the per-position table that belong to a transcript with at
# least one row in m3C_sites. The join keys are spelled out so the join cannot
# silently change if shared column names change (this also silences the
# "Joining with ..." message).
m3C_RNAs <-
  m3C_sites |>
  distinct(transcript_id, transcript_name) |>
  left_join(
    sampcomp_results_joined,
    by = join_by(transcript_id, transcript_name)
  )
m3C_RNAs
## # A tibble: 60,206 × 67
##    transcript_id     transcript_name position ref_kmer GMM_logit_pvalue_G
##    <chr>             <chr>              <dbl> <chr>                 <dbl>
##  1 ENST00000429711.7 RPL32-204             30 TCCTC                    NA
##  2 ENST00000429711.7 RPL32-204             31 CCTCG                     1
##  3 ENST00000429711.7 RPL32-204             32 CTCGG                     1
##  4 ENST00000429711.7 RPL32-204             33 TCGGC                     1
##  5 ENST00000429711.7 RPL32-204             34 CGGCG                     1
##  6 ENST00000429711.7 RPL32-204             35 GGCGC                     1
##  7 ENST00000429711.7 RPL32-204             36 GCGCT                     1
##  8 ENST00000429711.7 RPL32-204             37 CGCTG                     1
##  9 ENST00000429711.7 RPL32-204             38 GCTGC                     1
## 10 ENST00000429711.7 RPL32-204             39 CTGCC                     1
## # ℹ 60,196 more rows
## # ℹ 62 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## #   GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## #   Logit_LOR_G <chr>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## #   c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## #   c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## #   c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Count sites for every (intensity_up, intensity_down) label pair, then tag
# each pair: 'up' if intensity_up is 'common', otherwise 'down' if
# intensity_down is 'common', otherwise 'others'.
m3C_RNAs_allsites_intensity_group <-
  m3C_RNAs |>
  count(intensity_up, intensity_down) |>
  mutate(
    intensity_group = case_when(
      intensity_up == 'common'   ~ 'up',
      intensity_down == 'common' ~ 'down',
      TRUE                       ~ 'others'
    )
  )
m3C_RNAs_allsites_intensity_group
## # A tibble: 9 × 4
##   intensity_up intensity_down     n intensity_group
##   <chr>        <chr>          <int> <chr>          
## 1 common       others           588 up             
## 2 only G       only I             5 others         
## 3 only G       others          1757 others         
## 4 only I       only G             1 others         
## 5 only I       others           322 others         
## 6 others       common            19 down           
## 7 others       only G            39 others         
## 8 others       only I           166 others         
## 9 others       others         57309 others
#' Add cumulative ymin / ymax columns for stacked (donut) segments.
#'
#' @param df Data frame with a numeric `percentage` column, one row per
#'   segment, in drawing order. Any grouping on `df` is ignored.
#' @return `df` with two columns appended: `ymax`, the running total of
#'   `percentage / 100`, and `ymin`, the previous row's `ymax` (0 for the
#'   first row). A zero-row input yields a zero-row result.
add_yrange <- function(df) {
  stopifnot(is.data.frame(df), 'percentage' %in% names(df))

  upper <- cumsum(df$percentage / 100)
  df$ymax <- upper
  # Shift by one row. Indexing with seq_along() keeps the result length 0 for
  # an empty input, where c(0, head(upper, -1)) would have length 1 and fail
  # to assign.
  df$ymin <- c(0, upper)[seq_along(upper)]
  df
}


#' Draw a donut chart from a table of segment percentages.
#'
#' @param df Data frame with a `percentage` column; it is passed through
#'   `add_yrange()` to obtain each segment's `ymin` / `ymax`.
#' @param var Unquoted column used as the text label of each segment.
#' @param col Unquoted column mapped to the segment fill.
#' @param color_values Fill colours handed to `scale_fill_manual(values = )`.
#' @return A ggplot object.
donutplot <- function(df, var, col, color_values) {
  ring <- add_yrange(df)

  ggplot(
    ring,
    aes(xmin = 3, xmax = 4, ymin = ymin, ymax = ymax, fill = {{ col }})
  ) +
    geom_rect() +
    # Polar transform on y bends the stacked bar into a ring.
    coord_polar(theta = 'y') +
    # Labels sit at x = 1 (inside the hole), at the middle of each segment.
    ggrepel::geom_text_repel(
      aes(label = {{ var }}, y = (ymin + ymax) / 2),
      x = 1
    ) +
    scale_fill_manual(values = color_values) +
    # The x range below the ring's inner edge (xmin = 3) is what leaves the
    # hole in the middle.
    xlim(c(-1, 4)) +
    theme_void()
}

# Donut of the share of sites per intensity class within the m3C_RNAs
# transcripts.
percentage_intensity_up_sites_in_m3CRNAs <-
  m3C_RNAs_allsites_intensity_group |>
  # Collapse the label-pair rows to one row per intensity_group so the donut
  # gets one segment and one label per class. Feeding the un-aggregated rows
  # produced several segments sharing the same label, most of which ggrepel
  # then dropped as overlapping.
  summarise(n = sum(n), .by = intensity_group) |>
  arrange(intensity_group) |>
  mutate(percentage = 100 * n / sum(n)) |>
  donutplot(
    var = intensity_group, col = intensity_group,
    # Named so each class keeps its colour even if a class is absent.
    color_values = c(
      down = '#0000aa', others = '#999999', up = '#aa0000'
    )
  )
percentage_intensity_up_sites_in_m3CRNAs |>
  ggsave_pdf(width = 6, height = 4, outdir = figdir)
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps